{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "WaIDlfzUJXjN",
   "metadata": {
    "id": "WaIDlfzUJXjN"
   },
   "source": [
    "# Putting the R(etrieval) in RAG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aGixSoAgIzLo",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "collapsed": true,
    "id": "aGixSoAgIzLo",
    "jupyter": {
     "outputs_hidden": true
    },
    "outputId": "72d55e5e-bd10-45df-d1b6-6b217e069d91"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting pinecone-client\n",
      "  Downloading pinecone_client-4.1.0-py3-none-any.whl (215 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.5/215.5 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting openai\n",
      "  Downloading openai-1.30.5-py3-none-any.whl (320 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.7/320.7 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: certifi>=2019.11.17 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2024.2.2)\n",
      "Requirement already satisfied: tqdm>=4.64.1 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (4.66.4)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (4.11.0)\n",
      "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from pinecone-client) (2.0.7)\n",
      "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n",
      "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n",
      "Collecting httpx<1,>=0.23.0 (from openai)\n",
      "  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai) (2.7.1)\n",
      "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.1)\n",
      "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (3.7)\n",
      "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.1)\n",
      "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)\n",
      "  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)\n",
      "  Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai) (0.7.0)\n",
      "Requirement already satisfied: pydantic-core==2.18.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai) (2.18.2)\n",
      "Installing collected packages: pinecone-client, h11, httpcore, httpx, openai\n",
      "Successfully installed h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 openai-1.30.5 pinecone-client-4.1.0\n"
     ]
    }
   ],
   "source": [
    "!pip install pinecone-client openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aLyppGK_hSi6",
   "metadata": {
    "id": "aLyppGK_hSi6"
   },
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "from pinecone import Pinecone, ServerlessSpec\n",
    "import hashlib\n",
    "from datetime import datetime\n",
    "\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0289c54",
   "metadata": {
    "id": "f0289c54"
   },
   "outputs": [],
   "source": [
    "# Retrieve the Pinecone API key from user data\n",
    "pinecone_key = userdata.get('PINECONE_API_KEY')\n",
    "\n",
    "# Initialize the OpenAI client with the API key from user data\n",
    "client = OpenAI(\n",
    "    api_key=userdata.get('OPENAI_API_KEY')\n",
    ")\n",
    "\n",
    "# Define constants for the Pinecone index, namespace, and engine\n",
    "INDEX_NAME = 'semantic-search-rag'  # The name of the Pinecone index\n",
    "NAMESPACE = 'default'  # The namespace to use within the index\n",
    "ENGINE = 'text-embedding-3-small'  # The embedding model to use (vector size 1,536)\n",
    "\n",
    "# Initialize the Pinecone client with the retrieved API key\n",
    "pc = Pinecone(\n",
    "    api_key=pinecone_key\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cf993a3-6d51-49f4-8968-60f654d6202d",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3cf993a3-6d51-49f4-8968-60f654d6202d",
    "outputId": "194a6da8-2de8-4ca8-d616-8cb1381a3154"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1536, 2)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Function to get embeddings for a list of texts using the OpenAI API\n",
    "def get_embeddings(texts, engine=ENGINE):\n",
    "    # Create embeddings for the input texts using the specified engine\n",
    "    response = client.embeddings.create(\n",
    "        input=texts,\n",
    "        model=engine\n",
    "    )\n",
    "\n",
    "    # Extract and return the list of embeddings from the response\n",
    "    return [d.embedding for d in list(response.data)]\n",
    "\n",
    "# Function to get embedding for a single text using the OpenAI API\n",
    "def get_embedding(text, engine=ENGINE):\n",
    "    # Use the get_embeddings function to get the embedding for a single text\n",
    "    return get_embeddings([text], engine)[0]\n",
    "\n",
    "# Test the functions by getting the length of a single embedding and a list of embeddings\n",
    "len(get_embedding('hi')), len(get_embeddings(['hi', 'hello']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea70672a",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ea70672a",
    "outputId": "07f43e03-87d2-4d8f-b593-f5bb89c211c1"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pinecone.data.index.Index at 0x7f3616443370>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "if INDEX_NAME not in pc.list_indexes().names():  # need to create the index\n",
    "    print(f'Creating index {INDEX_NAME}')\n",
    "    pc.create_index(\n",
    "        name=INDEX_NAME,  # The name of the index\n",
    "        dimension=1536,  # The dimensionality of the vectors for our OpenAI embedder\n",
    "        metric='cosine',  # The similarity metric to use when searching the index\n",
    "        spec=ServerlessSpec(\n",
    "            cloud='aws',\n",
    "            region='us-west-2'\n",
    "        )\n",
    "    )\n",
    "\n",
    "# Store the index as a variable\n",
    "index = pc.Index(name=INDEX_NAME)\n",
    "index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6103d6c",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b6103d6c",
    "outputId": "11547a08-9982-4cc9-8b08-ecc27111e1a8"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dimension': 1536,\n",
       " 'index_fullness': 0.0,\n",
       " 'namespaces': {},\n",
       " 'total_vector_count': 0}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f2fdfe7",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 35
    },
    "id": "7f2fdfe7",
    "outputId": "c2656ea0-6fca-4907-fa2e-b46e122e72a6"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      },
      "text/plain": [
       "'ae76cc4dfd345ecaeea9b8ba0d5c3437'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def my_hash(s):\n",
    "    # Return the MD5 hash of the input string as a hexadecimal string\n",
    "    return hashlib.md5(s.encode()).hexdigest()\n",
    "\n",
    "my_hash('I love to hash it')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecd86f51",
   "metadata": {
    "id": "ecd86f51"
   },
   "outputs": [],
   "source": [
    "def prepare_for_pinecone(texts, engine=ENGINE, urls=None):\n",
    "    # Get the current UTC date and time\n",
    "    now = datetime.utcnow()\n",
    "\n",
    "    # Generate vector embeddings for each string in the input list, using the specified engine\n",
    "    embeddings = get_embeddings(texts, engine=engine)\n",
    "\n",
    "    # Create tuples of (hash, embedding, metadata) for each input string and its corresponding vector embedding\n",
    "    # The my_hash() function is used to generate a unique hash for each string, and the datetime.utcnow() function is used to generate the current UTC date and time\n",
    "    responses = [\n",
    "        (\n",
    "            my_hash(text),  # A unique ID for each string, generated using the my_hash() function\n",
    "            embedding,  # The vector embedding of the string\n",
    "            dict(text=text, date_uploaded=now)  # A dictionary of metadata, including the original text and the current UTC date and time\n",
    "        )\n",
    "        for text, embedding in zip(texts, embeddings)  # Iterate over each input string and its corresponding vector embedding\n",
    "    ]\n",
    "    if urls and len(urls) == len(texts):\n",
    "        for response, url in zip(responses, urls):\n",
    "            response[-1]['url'] = url\n",
    "\n",
    "    return responses\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c40d99a",
   "metadata": {
    "id": "4c40d99a"
   },
   "outputs": [],
   "source": [
    "texts = ['hi']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e1b73f3",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3e1b73f3",
    "outputId": "9098032f-8b27-48c8-99cc-2500d2eec532"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID:   49f68a5c8493ec2c0bf489821c21fc3b \n",
      "LEN:  1536 \n",
      "META: {'text': 'hi', 'date_uploaded': datetime.datetime(2024, 5, 29, 19, 23, 38, 727917)}\n"
     ]
    }
   ],
   "source": [
    "_id, embedding, metadata = prepare_for_pinecone(texts)[0]\n",
    "\n",
    "print('ID:  ',_id, '\\nLEN: ', len(embedding), '\\nMETA:', metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b49debd5",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b49debd5",
    "outputId": "4188bdb8-99d1-4537-891c-d8b52882ebe9"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ID:   49f68a5c8493ec2c0bf489821c21fc3b \n",
      "LEN:  1536 \n",
      "META: {'text': 'hi', 'date_uploaded': datetime.datetime(2024, 5, 29, 19, 23, 39, 50436), 'url': 'fake.url'}\n"
     ]
    }
   ],
   "source": [
    "urls = ['fake.url']\n",
    "_id, embedding, metadata = prepare_for_pinecone(texts, urls=urls)[0]\n",
    "\n",
    "print('ID:  ',_id, '\\nLEN: ', len(embedding), '\\nMETA:', metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf47aabd",
   "metadata": {
    "id": "bf47aabd"
   },
   "outputs": [],
   "source": [
    "def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False, urls=None):\n",
    "    # Call the prepare_for_pinecone function to prepare the input texts for indexing\n",
    "    total_upserted = 0\n",
    "    if not batch_size:\n",
    "        batch_size = len(texts)\n",
    "\n",
    "    _range = range(0, len(texts), batch_size)\n",
    "    for i in tqdm(_range) if show_progress_bar else _range:\n",
    "        text_batch = texts[i: i + batch_size]\n",
    "        if urls:\n",
    "            url_batch = urls[i: i + batch_size]\n",
    "            prepared_texts = prepare_for_pinecone(text_batch, urls=url_batch)\n",
    "        else:\n",
    "            prepared_texts = prepare_for_pinecone(text_batch)\n",
    "\n",
    "\n",
    "        # Use the upsert() method of the index object to upload the prepared texts to Pinecone\n",
    "        total_upserted += index.upsert(\n",
    "            vectors=prepared_texts,\n",
    "            namespace=namespace\n",
    "        )['upserted_count']\n",
    "\n",
    "\n",
    "    return total_upserted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b65cb2e1-f67d-4606-b0ff-c7b67aab04d8",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b65cb2e1-f67d-4606-b0ff-c7b67aab04d8",
    "outputId": "b14cdb39-45a3-47a0-83f8-c3fba92fb0b0"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dimension': 1536,\n",
       " 'index_fullness': 0.0,\n",
       " 'namespaces': {'default': {'vector_count': 1}},\n",
       " 'total_vector_count': 1}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Call the upload_texts_to_pinecone() function with the input texts\n",
    "upload_texts_to_pinecone(texts)\n",
    "\n",
    "index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f64fa0cb-d704-42ef-b06f-5ac882a3a55a",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "f64fa0cb-d704-42ef-b06f-5ac882a3a55a",
    "outputId": "0b6ec887-e74c-4648-adfc-8f6f4522619f"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['hi']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "V0XI6RAom-Ln",
   "metadata": {
    "id": "V0XI6RAom-Ln"
   },
   "outputs": [],
   "source": [
    "def query_from_pinecone(query, top_k=3, include_metadata=True):\n",
    "    # get embedding from THE SAME embedder as the documents\n",
    "    query_embedding = get_embedding(query, engine=ENGINE)\n",
    "\n",
    "    return index.query(\n",
    "      vector=query_embedding,\n",
    "      top_k=top_k,\n",
    "      namespace=NAMESPACE,\n",
    "      include_metadata=include_metadata   # gets the metadata (dates, text, etc)\n",
    "    ).get('matches')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84a0871f",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "84a0871f",
    "outputId": "df6cb931-9cbc-4d53-aa80-5fddfb9cd94a"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'id': '49f68a5c8493ec2c0bf489821c21fc3b',\n",
       "  'metadata': {'date_uploaded': '2024-05-29T19:03:42.652561', 'text': 'hi'},\n",
       "  'score': 0.80858922,\n",
       "  'values': []}]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test that the index is empty\n",
    "query_from_pinecone('hello')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13cc8bdc",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "13cc8bdc",
    "outputId": "0b5c5582-32f8-4fcc-8aca-e9ac68e7c802"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{}"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hashlib\n",
    "\n",
    "def delete_texts_from_pinecone(texts, namespace=NAMESPACE):\n",
    "    # Compute the hash (id) for each text\n",
    "    hashes = [hashlib.md5(text.encode()).hexdigest() for text in texts]\n",
    "\n",
    "    # The ids parameter is used to specify the list of IDs (hashes) to delete\n",
    "    return index.delete(ids=hashes, namespace=namespace)\n",
    "\n",
    "# delete our text\n",
    "delete_texts_from_pinecone(texts)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "074fab6f",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "074fab6f",
    "outputId": "e9086e2b-48dc-4848-9d15-7720033cd2d1"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dimension': 1536,\n",
       " 'index_fullness': 0.0,\n",
       " 'namespaces': {},\n",
       " 'total_vector_count': 0}"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c354983-ce8d-4818-98a0-e0e183909af6",
   "metadata": {
    "id": "3c354983-ce8d-4818-98a0-e0e183909af6"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4523cab6-0bc3-4791-9c73-7bd7d7e5858b",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4523cab6-0bc3-4791-9c73-7bd7d7e5858b",
    "outputId": "e1241529-5887-491e-831a-d0b1e473d1cc"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://faq.ssa.gov/en-US/topic?id=CAT-01092\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['https://faq.ssa.gov/en-us/Topic/article/KA-01735',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02713',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02125',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02166',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02131',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02995',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02983',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02137',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02154',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02113',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02148',\n",
       " 'https://faq.ssa.gov/en-us/Topic/article/KA-02989']"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_url = 'https://faq.ssa.gov'\n",
    "medicare_faqs = base_url + '/en-US/topic?id=CAT-01092'\n",
    "print(medicare_faqs)\n",
    "\n",
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "\n",
    "# get all links from medicare_faqs\n",
    "urls = []\n",
    "r = requests.get(medicare_faqs)\n",
    "soup = BeautifulSoup(r.content, 'html.parser')\n",
    "for link in soup.find_all('a'):\n",
    "    if 'href' in link.attrs:\n",
    "        if link['href'].startswith('/') and 'article' in link['href']:\n",
    "            urls.append(base_url + link['href'])\n",
    "\n",
    "urls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e622fbfa-2dde-4711-9c46-1390eb3430f9",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 196
    },
    "id": "e622fbfa-2dde-4711-9c46-1390eb3430f9",
    "outputId": "3f4b6ad7-bb23-4d9f-a99f-11f328adad1a"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 12/12 [00:32<00:00,  2.70s/it]\n"
     ]
    },
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      },
      "text/plain": [
       "\"\\n\\n\\n\\nYou’re offline. This is a read only version of the page.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSkip to content\\n\\n\\n \\n\\n\\n\\n\\n\\n\\nDo you need to submit W-2s to SSA? Business Services Online registration has changed!\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nWhat should I do if I get a call claiming there's a problem with my Social Security number or account?\\n\\n\\n\\nSkip to main content Social Security Search  Menu  Español  Sign in\\n\\n\\n\\n\\nFrequently Asked Questions\\n\\n\\n\\n\\nLast Modified: \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nFAQ Home\\n\\n\\nTopics\\n\\n\\r\\n\\t\\t\\t\\t\\tKA-01735\\r\\n\\t\\t\\t\\t\\n\\n\\n\\n\\n\\n Print\\n\\n\\n\\nHow do I get a replacement Medicare card? \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nViews: \\n\\n\\n\\nIf your Medicare card was lost, stolen, or destroyed, you can request a replacement online at Medicare.gov.\\nYou can print an official copy of your card from your online Medicare account \\nor call 1-800-MEDICARE (1-800-633-4227 TTY 1-877-486-2048) to order a replacement card to be sent in the mail.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nComments (0)\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nFooter menu\\n\\n\\n\\n\\n\\n\\n\\nGive us Feedback.\\n\\nDid this answer your question?\\n\\nNo\\nYes\\nNo\\n\\nThanks for your feedback.\\n\\n\\n\\n\\n\\n\\n\""
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "texts = []\n",
    "for url in tqdm(urls):\n",
    "    r = requests.get(url)\n",
    "    soup = BeautifulSoup(r.content, 'html.parser')\n",
    "    body = soup.find('body').get_text()\n",
    "    # CLEAN YOUR DATA HERE :)\n",
    "    texts.append(body)\n",
    "\n",
    "texts[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0221343a",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "0221343a",
    "outputId": "72953aba-424b-4756-ea73-f5592ae7d7dd"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3/3 [00:04<00:00,  1.39s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "BATCH_SIZE = 4\n",
    "upload_texts_to_pinecone(texts, batch_size=BATCH_SIZE, urls=urls, show_progress_bar=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33yf7QrWtwt-",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "33yf7QrWtwt-",
    "outputId": "5ec56246-4d35-44bb-b958-00117ca150ee"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dimension': 1536,\n",
       " 'index_fullness': 0.0,\n",
       " 'namespaces': {'default': {'vector_count': 12}},\n",
       " 'total_vector_count': 12}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "gQdFZg1b0-eH",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "gQdFZg1b0-eH",
    "outputId": "821a37b1-d063-49e9-9134-4fe591bc5ab3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://faq.ssa.gov/en-us/Topic/article/KA-01735 0.646217287 \n",
      "\n",
      "\n",
      "\n",
      "You’re offline. This is a read only version of\n",
      "https://faq.ssa.gov/en-us/Topic/article/KA-02713 0.500928938 \n",
      "\n",
      "\n",
      "\n",
      "You’re offline. This is a read only version of\n",
      "https://faq.ssa.gov/en-us/Topic/article/KA-02113 0.500682116 \n",
      "\n",
      "\n",
      "\n",
      "You’re offline. This is a read only version of\n"
     ]
    }
   ],
   "source": [
    " results = query_from_pinecone('I lost my medicare card', top_k=3)\n",
    " for result in results:\n",
    "    print(result['metadata']['url'], result['score'], result['metadata']['text'][:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "EswQgNRx0-pz",
   "metadata": {
    "id": "EswQgNRx0-pz"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "machine_shape": "hm",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
